
########################################################################################################
# Note: The cosine similarity computations could not be done in one go, due to limited working memory. #
#       Hence, they are split in several chunks (no looping). The resulting output files are combined  #
#       via Stata when compiling the main analysis file ("fb tw micro data w vars.dta").               #
########################################################################################################



library(haven)
library(quanteda)
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)

# set path
setwd(".../replication kit/intermediate files and code")


################ part 1
# Chunk 1: posts dated 2016-01-01 to 2016-01-05 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_1.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-01" & posts$date <= "2016-01-05"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part1 <- unique(best)

write_dta(output_part1, "max cosine files/max_cosine_1.dta", version = 14)




################ part 2
# Chunk 2: posts dated 2016-01-06 to 2016-01-10 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_2.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-06" & posts$date <= "2016-01-10"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part2 <- unique(best)

write_dta(output_part2, "max cosine files/max_cosine_2.dta", version = 14)







################ part 3
# Chunk 3: posts dated 2016-01-11 to 2016-01-15 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_3.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-11" & posts$date <= "2016-01-15"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part3 <- unique(best)

write_dta(output_part3, "max cosine files/max_cosine_3.dta", version = 14)





################ part 4
# Chunk 4: posts dated 2016-01-16 to 2016-01-20 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_4.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-16" & posts$date <= "2016-01-20"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part4 <- unique(best)

write_dta(output_part4, "max cosine files/max_cosine_4.dta", version = 14)





################ part 5
# Chunk 5: posts dated 2016-01-21 to 2016-01-25 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_5.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-21" & posts$date <= "2016-01-25"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part5 <- unique(best)

write_dta(output_part5, "max cosine files/max_cosine_5.dta", version = 14)





################ part 6
# Chunk 6: posts dated 2016-01-26 to 2016-01-30 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_6.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-26" & posts$date <= "2016-01-30"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part6 <- unique(best)

write_dta(output_part6, "max cosine files/max_cosine_6.dta", version = 14)





################ part 7
# Chunk 7: posts dated 2016-01-31 to 2016-02-04 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_7.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-01-31" & posts$date <= "2016-02-04"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part7 <- unique(best)

write_dta(output_part7, "max cosine files/max_cosine_7.dta", version = 14)





################ part 8
# Chunk 8: posts dated 2016-02-05 to 2016-02-09 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_8.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-02-05" & posts$date <= "2016-02-09"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part8 <- unique(best)

write_dta(output_part8, "max cosine files/max_cosine_8.dta", version = 14)





################ part 9
# Chunk 9: posts dated 2016-02-10 to 2016-02-14 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_9.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-02-10" & posts$date <= "2016-02-14"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part9 <- unique(best)

write_dta(output_part9, "max cosine files/max_cosine_9.dta", version = 14)





################ part 10
# Chunk 10: posts dated 2016-02-15 to 2016-02-19 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_10.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-02-15" & posts$date <= "2016-02-19"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part10 <- unique(best)

write_dta(output_part10, "max cosine files/max_cosine_10.dta", version = 14)





################ part 11
# Chunk 11: posts dated 2016-02-20 to 2016-02-24 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_11.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-02-20" & posts$date <= "2016-02-24"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part11 <- unique(best)

write_dta(output_part11, "max cosine files/max_cosine_11.dta", version = 14)





################ part 12
# Chunk 12: posts dated 2016-02-25 to 2016-02-29 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_12.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-02-25" & posts$date <= "2016-02-29"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part12 <- unique(best)

write_dta(output_part12, "max cosine files/max_cosine_12.dta", version = 14)





################ part 13
# Chunk 13: posts dated 2016-03-01 to 2016-03-05 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_13.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-03-01" & posts$date <= "2016-03-05"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part13 <- unique(best)

write_dta(output_part13, "max cosine files/max_cosine_13.dta", version = 14)





################ part 14
# Chunk 14: posts dated 2016-03-06 to 2016-03-10 (bounds inclusive). Among pairs
# of posts in the same date-outlet group but on different platforms, the largest
# cosine similarity per first-listed post id is saved to max_cosine_14.dta.
rm(list = ls())
posts <- as.data.table(read_dta(
  "fb tw micro data.dta",
  col_select = c(id, text, date, outlet, platform)
))

in_window <- posts$date >= "2016-03-06" & posts$date <= "2016-03-10"
chunk <- posts[in_window]

# One running number per (date, outlet) combination found in the chunk.
chunk <- as.data.table(
  mutate(group_by(chunk, date, outlet), group_id = cur_group_id())
)

# Document names encode group, platform and post id, separated by "|".
doc_labels <- paste(chunk$group_id, chunk$platform, chunk$id, sep = "|")
post_corpus <- corpus(chunk$text)
docnames(post_corpus) <- doc_labels

# Tokenise, lower-case and stem with the German stemmer.
toks <- post_corpus %>%
  tokens(remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
         remove_url = TRUE, remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): how dfm() treats these extra options on a tokens object depends
# on the installed quanteda release -- confirm against the version used here.
doc_term <- dfm(toks, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Pairwise document similarities as a table: document1, document2, cosine.
pair_tbl <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)
pair_tbl$document1 <- as.character(pair_tbl$document1)
pair_tbl$document2 <- as.character(pair_tbl$document2)

# Break both document names back into their three "|"-separated parts.
check <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document1, "|", fixed = TRUE))
)
checked <- data.frame(
  do.call("rbind", strsplit(pair_tbl$document2, "|", fixed = TRUE))
)
for (part in c("X1", "X2", "X3")) {
  check[[part]] <- as.character(check[[part]])
  checked[[part]] <- as.character(checked[[part]])
}
colnames(check)[1:3] <- c("check_group", "check_platform", "check_id")
colnames(checked)[1:3] <- c("checked_group", "checked_platform", "checked_id")

scored <- cbind(check, checked, cosine = pair_tbl$cosine)

# Keep only same-group pairs whose two posts sit on different platforms.
cross_platform <- filter(scored, check_group == checked_group,
                         check_platform != checked_platform)

# NOTE(review): the maximum is taken per document1 id only; whether every post
# appears as document1 depends on how as.data.frame() lists pairs -- confirm.
with_max <- cross_platform %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

best <- with_max[c("check_id", "max")]
names(best) <- c("id", "max_cosine")

# Collapse to one row per distinct (id, max_cosine) combination.
output_part14 <- unique(best)

write_dta(output_part14, "max cosine files/max_cosine_14.dta", version = 14)






################ part 15
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-03-11" & posts$date <= "2016-03-15"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part15 <- output

write_dta(output_part15, "max cosine files/max_cosine_15.dta", version = 14)





################ part 16
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-03-16" & posts$date <= "2016-03-20"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part16 <- output

write_dta(output_part16, "max cosine files/max_cosine_16.dta", version = 14)





################ part 17
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-03-21" & posts$date <= "2016-03-25"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part17 <- output

write_dta(output_part17, "max cosine files/max_cosine_17.dta", version = 14)





################ part 18
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-03-26" & posts$date <= "2016-03-30"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part18 <- output

write_dta(output_part18, "max cosine files/max_cosine_18.dta", version = 14)




################ part 19
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-03-31" & posts$date <= "2016-04-04"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part19 <- output

write_dta(output_part19, "max cosine files/max_cosine_19.dta", version = 14)




################ part 20
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-05" & posts$date <= "2016-04-09"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part20 <- output

write_dta(output_part20, "max cosine files/max_cosine_20.dta", version = 14)






################ part 21
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-10" & posts$date <= "2016-04-14"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part21 <- output

write_dta(output_part21, "max cosine files/max_cosine_21.dta", version = 14)







################ part 22
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-15" & posts$date <= "2016-04-19"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part22 <- output

write_dta(output_part22, "max cosine files/max_cosine_22.dta", version = 14)







################ part 23
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-20" & posts$date <= "2016-04-24"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part23 <- output

write_dta(output_part23, "max cosine files/max_cosine_23.dta", version = 14)






################ part 24
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-25" & posts$date <= "2016-04-29"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part24 <- output

write_dta(output_part24, "max cosine files/max_cosine_24.dta", version = 14)






################ part 25
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-04-30" & posts$date <= "2016-05-04"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part25 <- output

write_dta(output_part25, "max cosine files/max_cosine_25.dta", version = 14)






################ part 26
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-05-05" & posts$date <= "2016-05-09"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part26 <- output

write_dta(output_part26, "max cosine files/max_cosine_26.dta", version = 14)







################ part 27
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-05-10" & posts$date <= "2016-05-14"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part27 <- output

write_dta(output_part27, "max cosine files/max_cosine_27.dta", version = 14)







################ part 28
# Clear the workspace so objects from the previous part do not occupy memory.
rm(list = ls())

# Read only the columns needed for the similarity computation.
posts <- as.data.table(
  read_dta("fb tw micro data.dta", col_select = c(id, text, date, outlet, platform))
)

# Five-day window processed in this part.
posts_window <- posts[posts$date >= "2016-05-15" & posts$date <= "2016-05-19"]

# Number each date-outlet combination; only pairs sharing a number are kept below.
posts_window <- as.data.table(
  posts_window %>%
    group_by(date, outlet) %>%
    mutate(group_id = cur_group_id()) %>%
    ungroup()
)

# Document names are "group|platform|id" so that all three pieces can be
# recovered from the pairwise similarity table.
post_corpus <- corpus(posts_window$text)
docnames(post_corpus) <- paste(posts_window$group_id, posts_window$platform,
                               posts_window$id, sep = "|")

# Locals are called toks/dfmat so they do not shadow quanteda's tokens()/dfm().
toks <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE,
               remove_url = TRUE, remove_separators = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_wordstem(toks, language = "german")

# NOTE(review): the tokens are already lower-cased, cleaned and German-stemmed
# above; confirm whether the extra dfm() arguments (notably `stem = TRUE`) are
# needed. The call is left unchanged so the features stay the same.
dfmat <- dfm(toks, tolower = TRUE, remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

# Long table of document pairs with columns document1, document2, cosine.
sim <- as.data.frame(textstat_simil(dfmat, margin = "documents", method = "cosine"))

# Split the document names in one vectorised call each
# (columns: 1 = group, 2 = platform, 3 = id).
doc1 <- str_split_fixed(as.character(sim$document1), fixed("|"), 3)
doc2 <- str_split_fixed(as.character(sim$document2), fixed("|"), 3)

# Keep pairs from the same date-outlet group but from different platforms.
keep_idx <- which(doc1[, 1] == doc2[, 1] & doc1[, 2] != doc2[, 2])
matched <- data.frame(id = doc1[keep_idx, 3], group = doc2[keep_idx, 1],
                      cosine = sim$cosine[keep_idx], stringsAsFactors = FALSE)

# NOTE(review): as.data.frame() on a textstat_simil result appears to list each
# document pair only once by default (upper = FALSE), so the maximum below
# covers only the pairs in which a post is `document1` - confirm this is intended.
# Highest cross-platform cosine per post, one row per distinct (id, max_cosine).
output <- as.data.frame(
  matched %>%
    group_by(id, group) %>%
    mutate(max_cosine = max(cosine)) %>%
    ungroup() %>%
    distinct(id, max_cosine)
)

output_part28 <- output

write_dta(output_part28, "max cosine files/max_cosine_28.dta", version = 14)







################ part 29
# Per-post maximum cosine similarity to posts on the other platform within the
# same (date, outlet) group, for posts dated 2016-05-20 through 2016-05-24.
# The result is written to "max cosine files/max_cosine_29.dta".
rm(list = ls())

# Splits "group|platform|id" document names into character columns and names
# the first three <prefix>_group, <prefix>_platform and <prefix>_id.
split_doc_key <- function(keys, prefix) {
  parts <- data.frame(do.call("rbind", strsplit(keys, "|", fixed = TRUE)))
  for (col in c("X1", "X2", "X3")) {
    parts[[col]] <- as.character(parts[[col]])
  }
  names(parts)[1:3] <- paste(prefix, c("group", "platform", "id"), sep = "_")
  parts
}

posts <- as.data.table(
  read_dta("fb tw micro data.dta",
           col_select = c(id, text, date, outlet, platform))
)

# Restrict to this chunk's date window (both bounds inclusive).
window_start <- "2016-05-20"
window_end <- "2016-05-24"
in_window <- posts$date >= window_start & posts$date <= window_end
window_posts <- posts[in_window]

# One group_id per (date, outlet) combination.
window_posts <- window_posts %>%
  group_by(date, outlet) %>%
  mutate(group_id = cur_group_id()) %>%
  as.data.table()

# Document names carry group, platform and post id so they can be recovered
# from the similarity table further down.
doc_keys <- with(window_posts, paste(group_id, platform, id, sep = "|"))

post_corpus <- corpus(window_posts$text)
docnames(post_corpus) <- doc_keys

post_tokens <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, remove_url = TRUE,
                      remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): the tokens are already lower-cased and German-stemmed above;
# the extra dfm() arguments are kept exactly as written so results do not
# change -- confirm how the installed quanteda version treats them.
doc_term <- dfm(post_tokens, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Long-format table of document pairs with their cosine similarity.
# NOTE(review): confirm whether as.data.frame() lists each document pair once
# or in both directions; the per-post maximum below only sees rows where the
# post is document1.
pair_sims <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)

pairwise <- cbind(
  split_doc_key(as.character(pair_sims$document1), "check"),
  split_doc_key(as.character(pair_sims$document2), "checked"),
  pair_sims$cosine
)
colnames(pairwise)[7] <- "cosine"

# Keep pairs from the same group but different platforms, then attach the
# maximum cosine per (check_id, checked_group) to every remaining row.
matched <- pairwise %>%
  filter(check_group == checked_group, check_platform != checked_platform) %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

per_post <- matched[c("check_id", "max")]
names(per_post) <- c("id", "max_cosine")

# Collapse the repeated (id, max_cosine) rows to one row each.
output_part29 <- unique(per_post)

write_dta(output_part29, "max cosine files/max_cosine_29.dta", version = 14)







################ part 30
# Per-post maximum cosine similarity to posts on the other platform within the
# same (date, outlet) group, for posts dated 2016-05-25 through 2016-05-29.
# The result is written to "max cosine files/max_cosine_30.dta".
rm(list = ls())

# Splits "group|platform|id" document names into character columns and names
# the first three <prefix>_group, <prefix>_platform and <prefix>_id.
split_doc_key <- function(keys, prefix) {
  parts <- data.frame(do.call("rbind", strsplit(keys, "|", fixed = TRUE)))
  for (col in c("X1", "X2", "X3")) {
    parts[[col]] <- as.character(parts[[col]])
  }
  names(parts)[1:3] <- paste(prefix, c("group", "platform", "id"), sep = "_")
  parts
}

posts <- as.data.table(
  read_dta("fb tw micro data.dta",
           col_select = c(id, text, date, outlet, platform))
)

# Restrict to this chunk's date window (both bounds inclusive).
window_start <- "2016-05-25"
window_end <- "2016-05-29"
in_window <- posts$date >= window_start & posts$date <= window_end
window_posts <- posts[in_window]

# One group_id per (date, outlet) combination.
window_posts <- window_posts %>%
  group_by(date, outlet) %>%
  mutate(group_id = cur_group_id()) %>%
  as.data.table()

# Document names carry group, platform and post id so they can be recovered
# from the similarity table further down.
doc_keys <- with(window_posts, paste(group_id, platform, id, sep = "|"))

post_corpus <- corpus(window_posts$text)
docnames(post_corpus) <- doc_keys

post_tokens <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, remove_url = TRUE,
                      remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): the tokens are already lower-cased and German-stemmed above;
# the extra dfm() arguments are kept exactly as written so results do not
# change -- confirm how the installed quanteda version treats them.
doc_term <- dfm(post_tokens, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Long-format table of document pairs with their cosine similarity.
# NOTE(review): confirm whether as.data.frame() lists each document pair once
# or in both directions; the per-post maximum below only sees rows where the
# post is document1.
pair_sims <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)

pairwise <- cbind(
  split_doc_key(as.character(pair_sims$document1), "check"),
  split_doc_key(as.character(pair_sims$document2), "checked"),
  pair_sims$cosine
)
colnames(pairwise)[7] <- "cosine"

# Keep pairs from the same group but different platforms, then attach the
# maximum cosine per (check_id, checked_group) to every remaining row.
matched <- pairwise %>%
  filter(check_group == checked_group, check_platform != checked_platform) %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

per_post <- matched[c("check_id", "max")]
names(per_post) <- c("id", "max_cosine")

# Collapse the repeated (id, max_cosine) rows to one row each.
output_part30 <- unique(per_post)

write_dta(output_part30, "max cosine files/max_cosine_30.dta", version = 14)








################ part 31
# Per-post maximum cosine similarity to posts on the other platform within the
# same (date, outlet) group, for posts dated 2016-05-30 through 2016-05-31.
# The result is written to "max cosine files/max_cosine_31.dta".
rm(list = ls())

# Splits "group|platform|id" document names into character columns and names
# the first three <prefix>_group, <prefix>_platform and <prefix>_id.
split_doc_key <- function(keys, prefix) {
  parts <- data.frame(do.call("rbind", strsplit(keys, "|", fixed = TRUE)))
  for (col in c("X1", "X2", "X3")) {
    parts[[col]] <- as.character(parts[[col]])
  }
  names(parts)[1:3] <- paste(prefix, c("group", "platform", "id"), sep = "_")
  parts
}

posts <- as.data.table(
  read_dta("fb tw micro data.dta",
           col_select = c(id, text, date, outlet, platform))
)

# Restrict to this chunk's date window (both bounds inclusive).
window_start <- "2016-05-30"
window_end <- "2016-05-31"
in_window <- posts$date >= window_start & posts$date <= window_end
window_posts <- posts[in_window]

# One group_id per (date, outlet) combination.
window_posts <- window_posts %>%
  group_by(date, outlet) %>%
  mutate(group_id = cur_group_id()) %>%
  as.data.table()

# Document names carry group, platform and post id so they can be recovered
# from the similarity table further down.
doc_keys <- with(window_posts, paste(group_id, platform, id, sep = "|"))

post_corpus <- corpus(window_posts$text)
docnames(post_corpus) <- doc_keys

post_tokens <- tokens(post_corpus, remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, remove_url = TRUE,
                      remove_separators = TRUE) %>%
  tokens_tolower() %>%
  tokens_wordstem(language = "german")

# NOTE(review): the tokens are already lower-cased and German-stemmed above;
# the extra dfm() arguments are kept exactly as written so results do not
# change -- confirm how the installed quanteda version treats them.
doc_term <- dfm(post_tokens, tolower = TRUE, remove_numbers = TRUE,
                remove_punct = TRUE, stem = TRUE)

# Long-format table of document pairs with their cosine similarity.
# NOTE(review): confirm whether as.data.frame() lists each document pair once
# or in both directions; the per-post maximum below only sees rows where the
# post is document1.
pair_sims <- as.data.frame(
  textstat_simil(doc_term, margin = "documents", method = "cosine")
)

pairwise <- cbind(
  split_doc_key(as.character(pair_sims$document1), "check"),
  split_doc_key(as.character(pair_sims$document2), "checked"),
  pair_sims$cosine
)
colnames(pairwise)[7] <- "cosine"

# Keep pairs from the same group but different platforms, then attach the
# maximum cosine per (check_id, checked_group) to every remaining row.
matched <- pairwise %>%
  filter(check_group == checked_group, check_platform != checked_platform) %>%
  group_by(check_id, checked_group) %>%
  mutate(max = max(cosine)) %>%
  as.data.frame()

per_post <- matched[c("check_id", "max")]
names(per_post) <- c("id", "max_cosine")

# Collapse the repeated (id, max_cosine) rows to one row each.
output_part31 <- unique(per_post)

write_dta(output_part31, "max cosine files/max_cosine_31.dta", version = 14)








